/*
Copyright (c) 1990 The Regents of the University of California.
All rights reserved.

Redistribution and use in source and binary forms are permitted
provided that the above copyright notice and this paragraph are
duplicated in all such forms and that any documentation,
and/or other materials related to such
distribution and use acknowledge that the software was developed
by the University of California, Berkeley.  The name of the
University may not be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */
#include <picolibc.h>

#include "setarch.h"

#include "defines.h"

#ifdef __H8300SX__

	.global _memcpy
_memcpy:
	stm.l	er4-er6,@-er7

	; Set up source and destination pointers for movmd.
	mov.l	er0,er6
	mov.l	er1,er5

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers.  They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0.  This threshold is a compromise between the various cases.
	cmp	#16,LEN(r2)
	blo	simple

	; movmd.l only works for even addresses.  If one of the addresses
	; is odd and the other is not, fall back on a simple move.
	bld	#0,r5l
	bxor	#0,r6l
	bcs	simple

	; Make the addresses even.
	bld	#0,r5l
	bcc	word_aligned
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

word_aligned:
	; See if copying one word would make the first operand longword
	; aligned.  Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	bld	#1,r6l
	bcc	fast_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; 16-bit pointers and size_ts: one movmd.l is enough.  This code
	; is never reached with r4 == 0.
	movmd.l
	and.w	#3,r2
simple:
	mov.w	r2,r4
	beq	quit
	movmd.b
quit:
	rts/l	er4-er6
#else
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
	mov.w	r4,r4
	beq	fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
fast_loop:
	movmd.l
fast_loop_next:
	sub.w	#1,e4
	bhs	fast_loop

	; Mop up any left-over bytes.  We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w	#3,r2
	beq	quit
	mov.w	r2,r4
	movmd.b
quit:
	rts/l	er4-er6

simple:
	; Simple bytewise copy.  We need to handle all lengths, including zero.
	mov.w	r2,r4
	beq	simple_loop_next
simple_loop:
	movmd.b
simple_loop_next:
	sub.w	#1,e2
	bhs	simple_loop
	rts/l	er4-er6
#endif

#else

	.global _memcpy
_memcpy:
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	MOVP	A0P,A3P	; keep copy of final dst
	ADDP	A2P,A0P	; point to end of dst
	CMPP	A0P,A3P	; see if anything to do
	beq	quit

	ADDP	A2P,A1P	; point to end of src

	; lets see if we can do this in words
	or	A0L,A2L	; or in the dst address
	or	A3L,A2L	; or the length 
	or	A1L,A2L	; or the src address
	btst	#0,A2L	; see if the lsb is zero
	bne	byteloop

wordloop:
#ifdef __NORMAL_MODE__
	sub	#2,A1P
#else
	subs	#2,A1P		; point to word
#endif
	mov.w	@A1P,A2		; get word
	mov.w	A2,@-A0P	; save word
	CMPP	A0P,A3P		; at the front again ?
	bne 	wordloop
	rts

byteloop:
#ifdef __NORMAL_MODE__
	sub	#1,A1P
#else
	subs	#1,A1P		; point to byte
#endif
	mov.b	@A1P,A2L	; get byte
	mov.b	A2L,@-A0P	; save byte
	CMPP	A0P,A3P 	; at the front again ?
	bne 	byteloop

	; return with A0 pointing to dst
quit:	rts

#endif
